# Importing statements and display configurations
import sqlite3
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from IPython.display import display
# Set max_colwidth to None to display full contents of each cell
pd.set_option('display.max_colwidth', None)
# Loading data from csv file into a DataFrame
csv_file_path = 'csv_final_for_jupyter_notebook.csv'
df = pd.read_csv(csv_file_path, delimiter=',')
# Establishing connection to SQLite database
conn = sqlite3.connect(':memory:') # In-memory database for demonstration purposes
# Writing DataFrame to SQLite database, setting table name "final"
df.to_sql('final', conn, index=False)
1000
# Displaying the DataFrame head as a table using IPython display
display(df.head())
| F1 | Movie Name | Year | Duration | Rating | Metascore | Votes | Gross | Description | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | The Shawshank Redemption | 1994 | 142 | 9.3 | 81.0 | 34,709 | 28.34 | Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency. |
| 1 | 1 | The Godfather | 1972 | 175 | 9.2 | 100.0 | 34,709 | 134.97 | The aging patriarch of an organized crime dynasty in postwar New York City transfers control of his clandestine empire to his reluctant youngest son. |
| 2 | 2 | The Dark Knight | 2008 | 152 | 9.0 | 84.0 | 34,709 | 534.86 | When the menace known as the Joker wreaks havoc and chaos on the people of Gotham, Batman must accept one of the greatest psychological and physical tests of his ability to fight injustice. |
| 3 | 3 | The Lord of the Rings: The Return of the King | 2003 | 201 | 9.0 | 94.0 | 34,709 | 377.85 | Gandalf and Aragorn lead the World of Men against Sauron's army to draw his gaze from Frodo and Sam as they approach Mount Doom with the One Ring. |
| 4 | 4 | Schindler's List | 1993 | 195 | 9.0 | 94.0 | 34,709 | 96.90 | In German-occupied Poland during World War II, industrialist Oskar Schindler gradually becomes concerned for his Jewish workforce after witnessing their persecution by the Nazis. |
# General Info
display(df.info())
# Basic statistics
display('General Description', df.describe())
# Shape of the dataframe
display("The shape of the dataframe", df.shape)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1000 entries, 0 to 999 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 F1 1000 non-null int64 1 Movie Name 1000 non-null object 2 Year 1000 non-null int64 3 Duration 1000 non-null int64 4 Rating 1000 non-null float64 5 Metascore 836 non-null float64 6 Votes 1000 non-null object 7 Gross 817 non-null float64 8 Description 1000 non-null object dtypes: float64(3), int64(3), object(3) memory usage: 70.4+ KB
None
'General Description'
| F1 | Year | Duration | Rating | Metascore | Gross | |
|---|---|---|---|---|---|---|
| count | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 | 836.000000 | 817.000000 |
| mean | 499.500000 | 1991.190000 | 123.747000 | 7.962700 | 78.686603 | 70.132497 |
| std | 288.819436 | 23.924972 | 28.468816 | 0.276759 | 12.052971 | 113.765949 |
| min | 0.000000 | 1920.000000 | 45.000000 | 7.600000 | 28.000000 | 0.000000 |
| 25% | 249.750000 | 1975.000000 | 103.000000 | 7.700000 | 71.000000 | 3.270000 |
| 50% | 499.500000 | 1999.000000 | 120.000000 | 7.900000 | 80.000000 | 23.650000 |
| 75% | 749.250000 | 2010.000000 | 138.000000 | 8.100000 | 88.000000 | 85.080000 |
| max | 999.000000 | 2022.000000 | 321.000000 | 9.300000 | 100.000000 | 936.660000 |
'The shape of the dataframe'
(1000, 9)
# Creating a cursor object for querying data
cursor = conn.cursor()
# Find the average of Duration in min, Metascore, Rating, Gross in millions
query = """
SELECT
AVG(Duration) As Average_Duration_min,
ROUND(
AVG(Metascore),
2
) As Average_Metascore --Two decimal places
,
ROUND(
AVG(Rating),
2
) As Average_Rating --Two decimal places
,
ROUND(
AVG(Gross),
2
) As Average_Gross_mill -- millions with two decimal places
FROM
final
"""
# Fetching results into a DataFrame
results_df = pd.read_sql_query(query, conn)
# Displaying the DataFrame as a table using IPython display
display(results_df)
| Average_Duration_min | Average_Metascore | Average_Rating | Average_Gross_mill | |
|---|---|---|---|---|
| 0 | 123.747 | 78.69 | 7.96 | 70.13 |
# Counting the number of movies released after 1999, grouped by year
query = """
SELECT
[Year],
COUNT(*) AS movies_number
FROM
final
WHERE
[Year] >= 2000
GROUP BY
[Year]
ORDER BY
[Year] ASC
"""
results_df = pd.read_sql_query(query, conn)
display(results_df)
# Plotting the distribution of movies released in every year
fig = px.bar(
results_df,
x='Year',
y='movies_number',
title="Number of Movies Released by Year",
color_discrete_sequence=['green'],
width=1000,
height=500
)
fig.show()
| Year | movies_number | |
|---|---|---|
| 0 | 2000 | 18 |
| 1 | 2001 | 26 |
| 2 | 2002 | 16 |
| 3 | 2003 | 22 |
| 4 | 2004 | 32 |
| 5 | 2005 | 17 |
| 6 | 2006 | 25 |
| 7 | 2007 | 25 |
| 8 | 2008 | 20 |
| 9 | 2009 | 26 |
| 10 | 2010 | 21 |
| 11 | 2011 | 17 |
| 12 | 2012 | 23 |
| 13 | 2013 | 27 |
| 14 | 2014 | 32 |
| 15 | 2015 | 24 |
| 16 | 2016 | 28 |
| 17 | 2017 | 17 |
| 18 | 2018 | 21 |
| 19 | 2019 | 26 |
| 20 | 2020 | 8 |
| 21 | 2021 | 10 |
| 22 | 2022 | 7 |
# Top 10 higher rated movies
query = """
SELECT
[Movie Name],
[Rating]
FROM
final
ORDER BY
[Rating] DESC
LIMIT 10
"""
results_df = pd.read_sql_query(query, conn)
display(results_df)
results_df = results_df.iloc[::-1]# reversing the dataframe for plotly bar
fig = px.bar(
results_df,
x='Rating',
y='Movie Name',
color='Rating',
text_auto=True,
title='Top 10 rated Movies',
width =800,
height=500
)
fig.show()
| Movie Name | Rating | |
|---|---|---|
| 0 | The Shawshank Redemption | 9.3 |
| 1 | The Godfather | 9.2 |
| 2 | The Dark Knight | 9.0 |
| 3 | The Lord of the Rings: The Return of the King | 9.0 |
| 4 | Schindler's List | 9.0 |
| 5 | The Godfather: Part II | 9.0 |
| 6 | 12 Angry Men | 9.0 |
| 7 | Jai Bhim | 8.9 |
| 8 | Pulp Fiction | 8.9 |
| 9 | Inception | 8.8 |
# Top 10 higher grossing movies
query = """
SELECT
[Movie Name],
[Gross]
FROM
final
ORDER BY
[Gross] DESC
LIMIT 10
"""
results_df = pd.read_sql(query,conn)
display(results_df)
results_df =results_df.iloc[::-1]# reversing the dataframe for plotly bar
fig = px.bar(
results_df,
x='Gross',
y='Movie Name',
color='Gross',
text_auto=True,
title='Top 10 Gross Movies',
width =800,
height=500
)
fig.show()
| Movie Name | Gross | |
|---|---|---|
| 0 | Star Wars: Episode VII - The Force Awakens | 936.66 |
| 1 | Avengers: Endgame | 858.37 |
| 2 | Spider-Man: No Way Home | 804.75 |
| 3 | Avatar | 760.51 |
| 4 | Avengers: Infinity War | 678.82 |
| 5 | Titanic | 659.33 |
| 6 | The Avengers | 623.28 |
| 7 | Incredibles 2 | 608.58 |
| 8 | The Dark Knight | 534.86 |
| 9 | Rogue One: A Star Wars Story | 532.18 |
# Rating of Top 10 higher metascore movies
query = """
SELECT
[Movie Name],
[Metascore],
[Rating]
FROM
final
ORDER BY
[Gross] DESC
LIMIT 10
"""
results_df = pd.read_sql(query,conn)
display(results_df)
fig = px.bar(
results_df,
x='Metascore',
y='Movie Name',
color='Rating',
text_auto=True,
title=' Rating f Top 10 Metascore Movies',
width =800,
height=500
)
fig.show()
| Movie Name | Metascore | Rating | |
|---|---|---|---|
| 0 | Star Wars: Episode VII - The Force Awakens | 80.0 | 7.8 |
| 1 | Avengers: Endgame | 78.0 | 8.4 |
| 2 | Spider-Man: No Way Home | 71.0 | 8.3 |
| 3 | Avatar | 83.0 | 7.8 |
| 4 | Avengers: Infinity War | 68.0 | 8.4 |
| 5 | Titanic | 75.0 | 7.9 |
| 6 | The Avengers | 69.0 | 8.0 |
| 7 | Incredibles 2 | 80.0 | 7.6 |
| 8 | The Dark Knight | 84.0 | 9.0 |
| 9 | Rogue One: A Star Wars Story | 65.0 | 7.8 |
# List of long movies (duration >=180 min)
query ="""
SELECT
[Movie Name],
[Duration]
FROM
final
WHERE
[Duration] >= 180
ORDER BY
[Duration] DESC
"""
results_df = pd.read_sql(query, conn)
display(results_df)
| Movie Name | Duration | |
|---|---|---|
| 0 | Gangs of Wasseypur | 321 |
| 1 | Zack Snyder's Justice League | 242 |
| 2 | Hamlet | 242 |
| 3 | Gone with the Wind | 238 |
| 4 | Once Upon a Time in America | 229 |
| 5 | Lagaan: Once Upon a Time in India | 224 |
| 6 | The Ten Commandments | 220 |
| 7 | Lawrence of Arabia | 218 |
| 8 | Ben-Hur | 212 |
| 9 | The Irishman | 209 |
| 10 | Seven Samurai | 207 |
| 11 | Andrei Rublev | 205 |
| 12 | The Godfather: Part II | 202 |
| 13 | Malcolm X | 202 |
| 14 | The Lord of the Rings: The Return of the King | 201 |
| 15 | Doctor Zhivago | 197 |
| 16 | Spartacus | 197 |
| 17 | Winter Sleep | 196 |
| 18 | Schindler's List | 195 |
| 19 | Titanic | 194 |
| 20 | The Right Stuff | 193 |
| 21 | Veer-Zaara | 192 |
| 22 | Gandhi | 191 |
| 23 | The Green Mile | 189 |
| 24 | Swades | 189 |
| 25 | JFK | 189 |
| 26 | Fanny and Alexander | 188 |
| 27 | Magnolia | 188 |
| 28 | Short Cuts | 188 |
| 29 | RRR | 187 |
| 30 | Bhaag Milkha Bhaag | 186 |
| 31 | The Leopard | 186 |
| 32 | Kal Ho Naa Ho | 186 |
| 33 | Barry Lyndon | 185 |
| 34 | M.S. Dhoni: The Untold Story | 184 |
| 35 | Dil Chahta Hai | 183 |
| 36 | The Deer Hunter | 183 |
| 37 | Avengers: Endgame | 181 |
| 38 | Dilwale Dulhania Le Jayenge | 181 |
| 39 | Dances with Wolves | 181 |
| 40 | Fiddler on the Roof | 181 |
| 41 | The Wolf of Wall Street | 180 |
| 42 | Blood In, Blood Out | 180 |
| 43 | Blue Is the Warmest Colour | 180 |
# Top 10 higher grossing police movies by description
query = """
SELECT
[Movie Name],
[Gross],
[Description]
FROM
final
WHERE
(
[Description] LIKE '%police%'
OR [Description] LIKE '%thief%'
OR [Description] LIKE '%robber%'
) -- Police movies
AND [Gross] IS NOT NULL --Not including NULL Gross values
ORDER BY
[Gross] DESC
LIMIT 10
"""
results_df = pd.read_sql(query, conn)
display(results_df)
fig=px.bar(
results_df,
x='Movie Name',
y='Gross',
color='Gross',
title='Top 10 higher grossing police movies',
text_auto=True,
width=800,
height=500
)
fig.show()
| Movie Name | Gross | Description | |
|---|---|---|---|
| 0 | Inception | 292.58 | A thief who steals corporate secrets through the use of dream-sharing technology is given the inverse task of planting an idea into the mind of a C.E.O., but his tragic past may doom the project and his team to disaster. |
| 1 | Tangled | 200.82 | The magically long-haired Rapunzel has spent her entire life in a tower, but now that a runaway thief has stumbled upon her, she is about to discover the world for the first time, and who she really is. |
| 2 | As Good as It Gets | 148.48 | A single mother and waitress, a misanthropic author, and a gay artist form an unlikely friendship after the artist is assaulted in a robbery. |
| 3 | The Departed | 132.38 | An undercover cop and a mole in the police attempt to identify each other while infiltrating an Irish gang in South Boston. |
| 4 | Minority Report | 132.07 | In a future where a special police unit is able to arrest murderers before they commit their crimes, an officer from that unit is himself accused of a future murder. |
| 5 | Butch Cassidy and the Sundance Kid | 102.31 | Wyoming, early 1900s. Butch Cassidy and The Sundance Kid are the leaders of a band of outlaws. After a train robbery goes wrong they find themselves on the run with a posse hard on their heels. Their solution - escape to Bolivia. |
| 6 | Inside Man | 88.51 | A police detective, a bank robber, and a high-power broker enter high-stakes negotiations after the criminal's brilliant heist spirals into a hostage situation. |
| 7 | The Naked Gun: From the Files of Police Squad! | 78.76 | Incompetent police Detective Frank Drebin must foil an attempt to assassinate Queen Elizabeth II. |
| 8 | L.A. Confidential | 64.62 | As corruption grows in 1950s Los Angeles, three policemen - one strait-laced, one brutal, and one sleazy - investigate a series of murders with their own brand of justice. |
| 9 | Prisoners | 61.00 | When Keller Dover's daughter and her friend go missing, he takes matters into his own hands as the police pursue multiple leads and the pressure mounts. |
# Top 10 higher Metascore crime movies by description
query = """
SELECT
[Metascore],
[Movie Name],
[Description]
FROM
final
WHERE
(
[Description] LIKE '%murder%'
OR [Description] LIKE '%crime%'
OR [Description] LIKE '%killer%'
) -- Crime movies
AND [Metascore] IS NOT NULL --Not including NULL Metascore values
ORDER BY
[Metascore] DESC
LIMIT 10
"""
results_df = pd.read_sql(query, conn)
display(results_df)
fig=px.bar(
results_df,
x='Movie Name',
y='Metascore',
color='Metascore',
title='Top 10 higher Metascore crime movies',
text_auto=True,
width=800,
height=500
)
fig.show()
| Metascore | Movie Name | Description | |
|---|---|---|---|
| 0 | 100.0 | The Godfather | The aging patriarch of an organized crime dynasty in postwar New York City transfers control of his clandestine empire to his reluctant youngest son. |
| 1 | 100.0 | Rear Window | A wheelchair-bound photographer spies on his neighbors from his Greenwich Village courtyard apartment window, and becomes convinced one of them has committed murder, despite the skepticism of his fashion-model girlfriend. |
| 2 | 99.0 | Touch of Evil | A stark, perverse story of murder, kidnapping, and police corruption in a Mexican border town. |
| 3 | 98.0 | Rashomon | The rape of a bride and the murder of her samurai husband are recalled from the perspectives of a bandit, the bride, the samurai's ghost and a woodcutter. |
| 4 | 97.0 | Rififi | Four men plan a technically perfect crime, but the human element intervenes... |
| 5 | 97.0 | The Maltese Falcon | San Francisco private detective Sam Spade takes on a case that involves him with three eccentric criminals, a gorgeous liar, and their quest for a priceless statuette, with the stakes rising after his partner is murdered. |
| 6 | 96.0 | 12 Angry Men | The jury in a New York City murder trial is frustrated by a single member whose skeptical caution forces them to more carefully consider the evidence before jumping to a hasty verdict. |
| 7 | 95.0 | Double Indemnity | A Los Angeles insurance representative lets an alluring housewife seduce him into a scheme of insurance fraud and murder that arouses the suspicion of his colleague, an insurance investigator. |
| 8 | 94.0 | Elevator to the Gallows | A self-assured businessman murders his employer, the husband of his mistress, which unintentionally provokes an ill-fated chain of events. |
| 9 | 94.0 | The Irishman | Hitman Frank Sheeran looks back at the secrets he kept as a loyal member of the Bufalino crime family. |
# List of World war two movies by description
query = """
SELECT
[Year],
[Movie Name],
[Rating],
[Description]
FROM
final
WHERE
(
[Description] LIKE '%nazi%'
OR [Description] LIKE '%world war 2%'
OR [Description] LIKE '%world war two%'
)
AND [Description] NOT LIKE '%neo-nazi%' --Not including post war neo_nazi movies
ORDER BY
[Rating] DESC
"""
results_df = pd.read_sql(query, conn)
display(results_df)
fig=px.scatter(
results_df,
x='Year',
y='Rating',
color='Movie Name',
width=800,
height=500
)
# Updating the marker size
fig.update_traces(marker=dict(size=results_df['Rating']*2,
line=dict(width=2,
color='SandyBrown')),
selector=dict(mode='markers'))
fig.show()
| Year | Movie Name | Rating | Description | |
|---|---|---|---|---|
| 0 | 1993 | Schindler's List | 9.0 | In German-occupied Poland during World War II, industrialist Oskar Schindler gradually becomes concerned for his Jewish workforce after witnessing their persecution by the Nazis. |
| 1 | 1942 | Casablanca | 8.5 | A cynical expatriate American cafe owner struggles to decide whether or not to help his former lover and her fugitive husband escape the Nazis in French Morocco. |
| 2 | 1981 | Indiana Jones and the Raiders of the Lost Ark | 8.4 | In 1936, archaeologist and adventurer Indiana Jones is hired by the U.S. government to find the Ark of the Covenant before Adolf Hitler's Nazis can obtain its awesome powers. |
| 3 | 2009 | Inglourious Basterds | 8.3 | In Nazi-occupied France during World War II, a plan to assassinate Nazi leaders by a group of Jewish U.S. soldiers coincides with a theatre owner's vengeful plans for the same. |
| 4 | 1961 | Judgment at Nuremberg | 8.3 | In 1948, an American court in occupied Germany tries four Nazis judged for war crimes. |
| 5 | 2004 | Downfall | 8.2 | Traudl Junge, the final secretary for Adolf Hitler, tells of the Nazi dictator's final days in his Berlin bunker at the end of WWII. |
| 6 | 1989 | Indiana Jones and the Last Crusade | 8.2 | In 1938, after his father Professor Henry Jones, Sr. goes missing while pursuing the Holy Grail, Professor Henry "Indiana" Jones, Jr. finds himself up against Adolf Hitler's Nazis again to stop them from obtaining its powers. |
| 7 | 1942 | To Be or Not to Be | 8.2 | During the Nazi occupation of Poland, an acting troupe becomes embroiled in a Polish soldier's efforts to track down a German spy. |
| 8 | 1945 | Rome, Open City | 8.0 | During the Nazi occupation of Rome in 1944, the Resistance leader, Giorgio Manfredi, is chased by the Nazis as he seeks refuge and a way to escape. |
| 9 | 1946 | Notorious | 7.9 | The daughter of a convicted Nazi spy is asked by American agents to gather information on a ring of Nazi scientists in South America. How far will she have to go to ingratiate herself with them? |
| 10 | 1972 | Cabaret | 7.8 | A female girlie club entertainer in Weimar Republic era Berlin romances two men while the Nazi Party rises to power around them. |
| 11 | 2006 | Black Book | 7.7 | In the Nazi-occupied Netherlands during World War II, a Jewish singer infiltrates the regional Gestapo headquarters for the Dutch resistance. |
| 12 | 1968 | Where Eagles Dare | 7.6 | Allied agents stage a daring raid on a castle where the Nazis are holding American brigadier general George Carnaby prisoner, but that's not all that's really going on. |
# Grouping the data by 'Year' and counting the occurrences
year_counts= df['Year'].value_counts().reset_index()
year_counts.columns = ['Year', 'Count']
# Sorting the dataframe by 'Year'
year_counts = year_counts.sort_values(by='Year')
# Plotting the distribution of movies released in every year
fig = px.bar(
year_counts,
x='Year',
y='Count',
title="Number of Movies Released by Year",
color_discrete_sequence=['green'],
width=1000,
height=500
)
fig.show()
fig =px.scatter(
year_counts,
x='Count',
y='Year',
color='Year',
title="Number of Movies Released by Year",
width=800,
height=500
)
# Updating the marker size
fig.update_traces(marker=dict(size=year_counts['Count'],
line=dict(width=2,
color='SandyBrown')),
selector=dict(mode='markers'))
fig.show()
# Ploting histogram of metascore distribution including a box plot
fig = px.histogram(
df,
x='Metascore',
nbins=20,
text_auto=True,
title='Metascore distribution',
color_discrete_sequence=['blue'],
width=1000,
height=500,
marginal='box'
)
fig.show()
# Ploting pie chart of gross distribution
# Define the bin edges
bin_edges = [0, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
# Define bin labels
bin_labels = ['0-100', '101-200', '201-300', '301-400', '401-500',
'501-600', '601-700', '701-800', '801-900', '901-1000']
# Classify gross values into bins
df['Gross_Class'] = pd.cut(df['Gross'], bins=bin_edges, labels=bin_labels, right=False)
# Calculate the count of movies in each gross class
gross_class_counts = df['Gross_Class'].value_counts().reset_index()
gross_class_counts.columns = ['Gross_Class', 'Count']
# Plotting the pie chart
fig = px.pie(
gross_class_counts,
names='Gross_Class',
values='Count',
title='Gross Distribution (0-1000) millions',
width=800,
height=500,
hole=0.5
)
fig.show()
#Insert a new decade column for further decade analysis
# Add a new column of name Decade and type int
alter_query="""
ALTER TABLE final
ADD COLUMN Decade INT
"""
# Execute alter query
cursor.execute(alter_query)
# Define the values of the new column
update_query="""
UPDATE final
SET Decade = (ROUND(Year/10)*10)
"""
# Execute update query
cursor.execute(update_query)
# Commiting the changes to the database
conn.commit()
# Verifying the changes by fetching the data from the updated table
query = """
SELECT * FROM final
"""
updated_df = pd.read_sql_query(query, conn)
# Display the updated DataFrame
display(updated_df.head())
display(updated_df.dtypes)
| F1 | Movie Name | Year | Duration | Rating | Metascore | Votes | Gross | Description | Decade | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | The Shawshank Redemption | 1994 | 142 | 9.3 | 81.0 | 34,709 | 28.34 | Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency. | 1990 |
| 1 | 1 | The Godfather | 1972 | 175 | 9.2 | 100.0 | 34,709 | 134.97 | The aging patriarch of an organized crime dynasty in postwar New York City transfers control of his clandestine empire to his reluctant youngest son. | 1970 |
| 2 | 2 | The Dark Knight | 2008 | 152 | 9.0 | 84.0 | 34,709 | 534.86 | When the menace known as the Joker wreaks havoc and chaos on the people of Gotham, Batman must accept one of the greatest psychological and physical tests of his ability to fight injustice. | 2000 |
| 3 | 3 | The Lord of the Rings: The Return of the King | 2003 | 201 | 9.0 | 94.0 | 34,709 | 377.85 | Gandalf and Aragorn lead the World of Men against Sauron's army to draw his gaze from Frodo and Sam as they approach Mount Doom with the One Ring. | 2000 |
| 4 | 4 | Schindler's List | 1993 | 195 | 9.0 | 94.0 | 34,709 | 96.90 | In German-occupied Poland during World War II, industrialist Oskar Schindler gradually becomes concerned for his Jewish workforce after witnessing their persecution by the Nazis. | 1990 |
F1 int64 Movie Name object Year int64 Duration int64 Rating float64 Metascore float64 Votes object Gross float64 Description object Decade int64 dtype: object
# Grouping the data by 'Decade' and counting the occurrences
decade_counts= updated_df['Decade'].value_counts().reset_index()
decade_counts.columns = ['Decade', 'Count']
# Sorting the dataframe by 'Decade'
decade_counts = decade_counts.sort_values(by='Decade')
# Plotting the donut chart
fig = px.pie(
decade_counts,
names='Decade',
values='Count',
title='Percentage of Movies Released by Decade',
width=800,
height=500,
hole=0.6 # Set the size of the hole to create the donut shape
)
fig.show()
# Average values by Decade
query = """
SELECT
Decade,
ROUND(
AVG(Duration),
0
) As Average_Duration_min,
ROUND(
AVG(Metascore),
2
) As Average_Metascore --Two decimal places
,
ROUND(
AVG(Rating),
2
) As Average_Rating --two decimal places
,
ROUND(
AVG(Gross),
2
) As Average_Gross_mill -- millions with two decimal places
FROM
final
GROUP BY Decade
ORDER BY Decade ASC
"""
# Fetch results into a DataFrame
results_df = pd.read_sql_query(query, conn)
# Display the DataFrame as a table using IPython display
display(results_df)
| Decade | Average_Duration_min | Average_Metascore | Average_Rating | Average_Gross_mill | |
|---|---|---|---|---|---|
| 0 | 1920 | 85.0 | 95.00 | 8.14 | 1.85 |
| 1 | 1930 | 102.0 | 91.00 | 7.94 | 28.59 |
| 2 | 1940 | 109.0 | 92.52 | 8.02 | 7.91 |
| 3 | 1950 | 116.0 | 89.76 | 8.06 | 12.86 |
| 4 | 1960 | 128.0 | 83.59 | 7.96 | 31.35 |
| 5 | 1970 | 124.0 | 83.46 | 7.99 | 53.37 |
| 6 | 1980 | 122.0 | 76.89 | 7.96 | 60.70 |
| 7 | 1990 | 124.0 | 75.10 | 7.98 | 63.97 |
| 8 | 2000 | 124.0 | 74.56 | 7.91 | 72.56 |
| 9 | 2010 | 129.0 | 77.41 | 7.93 | 107.47 |
| 10 | 2020 | 145.0 | 77.61 | 8.17 | 320.26 |
# Average decade gross vs total gross
fig = go.Figure()
# Adding bar plot of average Gross by decade
fig.add_trace(trace=go.Bar(
x=results_df['Decade'],
y=results_df['Average_Gross_mill'],
text=results_df['Average_Gross_mill'],
marker_color=results_df['Average_Gross_mill'],
name='Averege Gross'
))
# Adding the line of the total average Gross 7.13 millions
fig.add_trace(trace=go.Scatter(
x=[1915, 2025],
y=[7.13, 7.13],
mode='lines',
marker_color='black',
name='Total average Gross line = 7.13 millions'
))
# Setting the general title and the size of the chart
fig.update_layout(title='Average Gross by Decade',
width=1000,
height=500,
)
# Setting X-axis title and angle for decades display
fig.update_xaxes(title='Decade', tickangle = 45)
fig.show()